Code
import plotly.io as pio

# Render plotly figures inline in the notebook.
pio.renderers.default = "plotly_mimetype+notebook"

# Research Question: What does the job market look like for Business
# Analytics, Data Science, and ML professionals in 2024?
What we’re looking into:
1. Who’s hiring? → Top industries (barplot) and companies (treemap)
2. What roles exist? → Job titles within our occupation categories
3. What do they say? → Word cloud from job descriptions
4. What skills do they want? → Radar charts for each occupation + software skills barplot
5. What drives salary? → Salary by remote work type
6. Where are jobs posted? → Source types analysis
Each insight builds toward our ML modeling decisions.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
from wordcloud import WordCloud
import re
pd.set_option('display.max_columns', None)
# Load data
df = pd.read_csv('data/lightcast_cleaned.csv')
df['POSTED'] = pd.to_datetime(df['POSTED'])

# Exclude unclassified/unknown values for cleaner analysis.
# case=False on all three filters so casing variants of "Unclassified"
# are treated consistently (the NAICS filter previously matched
# case-sensitively, unlike the company/title filters).
df_clean = df[~df['NAICS_2022_2_NAME'].str.contains('Unclassified', case=False, na=False)].copy()
df_clean = df_clean[~df_clean['COMPANY_NAME'].str.contains('Unclassified', case=False, na=False)].copy()
df_clean = df_clean[~df_clean['TITLE_NAME'].str.contains('Unclassified', case=False, na=False)].copy()

print(f"Dataset: {len(df):,} job postings")
print(f"After filtering unclassified: {len(df_clean):,} job postings")
print(f"Date range: {df['POSTED'].min().strftime('%b %Y')} - {df['POSTED'].max().strftime('%b %Y')}")
print(f"Occupations: {df['LOT_V6_OCCUPATION_NAME'].nunique()}")
# Output: Dataset: 55,917 job postings
After filtering unclassified: 46,713 job postings
Date range: May 2024 - Sep 2024
Occupations: 4
# Top 10 Industries - Horizontal Bar Plot
top_industries = df_clean['NAICS_2022_2_NAME'].value_counts().head(10)

# Single horizontal bar trace, labelled with raw posting counts.
industry_bar = go.Bar(
    y=top_industries.index,
    x=top_industries.values,
    orientation='h',
    marker_color='steelblue',
    text=top_industries.values,
    textposition='outside',
)
fig = go.Figure(data=[industry_bar])
fig.update_layout(
    title='Top 10 Industries Hiring Data Professionals',
    xaxis_title='Number of Job Postings',
    yaxis={'categoryorder': 'total ascending'},
    template='plotly_white',
    height=450,
    margin=dict(r=80),  # room for the outside text labels
)
fig.write_image('figures/top_industries.png', scale=2)
fig.show()
# Top 10 Companies - Treemap visualization
company_counts = (
    df_clean['COMPANY_NAME']
    .value_counts()
    .head(10)
    .rename_axis('Company')
    .reset_index(name='Postings')
)

fig2 = px.treemap(
    company_counts,
    path=['Company'],
    values='Postings',
    title='Top 10 Companies Hiring Data Professionals',
    color='Postings',
    color_continuous_scale='Viridis'
)
fig2.update_layout(height=500, template='plotly_white')
fig2.update_traces(textinfo='label+value+percent root')
fig2.write_image('figures/top_companies_treemap.png', scale=2)
fig2.show()

# Text summary of the same top-10 list, with share of all cleaned postings.
print("\nTop 10 Companies by Job Postings:")
total_postings = len(df_clean)
for rank, (company, postings) in enumerate(
        zip(company_counts['Company'], company_counts['Postings']), start=1):
    pct = postings / total_postings * 100
    print(f" {rank}. {company}: {postings:,} postings ({pct:.1f}%)")
Top 10 Companies by Job Postings:
1. Deloitte: 2,271 postings (4.9%)
2. Accenture: 1,316 postings (2.8%)
3. PricewaterhouseCoopers: 697 postings (1.5%)
4. Insight Global: 355 postings (0.8%)
5. Cardinal Health: 346 postings (0.7%)
6. Smx Corporation Limited: 317 postings (0.7%)
7. Oracle: 311 postings (0.7%)
8. Robert Half: 308 postings (0.7%)
9. Randstad: 285 postings (0.6%)
10. Lumen Technologies: 267 postings (0.6%)
occupations = df_clean['LOT_V6_OCCUPATION_NAME'].unique()

# One fixed color per occupation, reused across figures for consistency.
colors = {'Data / Data Mining Analyst': '#1f77b4',
          'Business Intelligence Analyst': '#ff7f0e',
          'Business / Management Analyst': '#2ca02c',
          'Market Research Analyst': '#d62728'}

# 2x2 grid: one horizontal-bar subplot of top job titles per occupation.
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=[f"{occ}" for occ in occupations],
    horizontal_spacing=0.15,
    vertical_spacing=0.25
)
for idx, occ in enumerate(occupations):
    row = idx // 2 + 1
    col = idx % 2 + 1
    # 'Unclassified' titles were already removed when df_clean was built,
    # so no re-filtering is needed here (the original re-applied the same
    # filter redundantly).
    df_occ = df_clean[df_clean['LOT_V6_OCCUPATION_NAME'] == occ]
    top_titles = df_occ['TITLE_NAME'].value_counts().head(8)
    # Truncate long titles so y-axis labels stay readable.
    truncated_titles = [t[:28] + '...' if len(t) > 28 else t for t in top_titles.index]
    fig.add_trace(
        go.Bar(
            y=truncated_titles,
            x=top_titles.values,
            orientation='h',
            marker_color=colors.get(occ, '#636EFA'),
            text=top_titles.values,
            textposition='outside',
            name=occ,
            showlegend=False
        ),
        row=row, col=col
    )
    fig.update_xaxes(title_text="Count", row=row, col=col)
    fig.update_yaxes(categoryorder='total ascending', row=row, col=col)
fig.update_layout(
    title_text='Top 8 Job Titles per Occupation Category (Excl. Unclassified)',
    height=800,
    template='plotly_white',
    margin=dict(l=20, r=80, t=80, b=20)
)
fig.write_image('figures/job_titles_by_occupation.png', scale=2)
fig.show()
# Text summary: posting count and share per occupation category.
print("\nPostings per Occupation:")
occupation_totals = df_clean['LOT_V6_OCCUPATION_NAME'].value_counts()
n_clean = len(df_clean)
for occupation, n_postings in occupation_totals.items():
    print(f" • {occupation}: {n_postings:,} ({n_postings/n_clean*100:.1f}%)")
Postings per Occupation:
• Data / Data Mining Analyst: 22,352 (47.8%)
• Business Intelligence Analyst: 21,244 (45.5%)
• Business / Management Analyst: 2,999 (6.4%)
• Market Research Analyst: 118 (0.3%)
# Using a sample of 3000 rows to prevent crashes
# Stopwords: generic English filler plus boilerplate common to job ads
# (EEO language, application instructions, URL fragments, and generic
# requirement phrasing) that would otherwise dominate the cloud.
custom_stopwords = {
    'the', 'and', 'to', 'of', 'a', 'in', 'for', 'is', 'on', 'that', 'by', 'this',
    'with', 'are', 'be', 'as', 'at', 'from', 'or', 'an', 'will', 'your', 'you',
    'we', 'our', 'have', 'has', 'it', 'their', 'all', 'can', 'been', 'would',
    'who', 'more', 'if', 'about', 'which', 'when', 'what', 'into', 'also',
    'may', 'other', 'its', 'than', 'should', 'such', 'any', 'these', 'only',
    'new', 'well', 'them', 'they', 'but', 'not', 'do', 'up', 'out', 'so',
    'job', 'position', 'apply', 'applicant', 'employer', 'employment',
    'equal', 'opportunity', 'eeo', 'affirmative', 'action', 'disability',
    'race', 'color', 'religion', 'sex', 'national', 'origin', 'age',
    'status', 'protected', 'discrimination', 'including', 'without', 'regard',
    'com', 'www', 'http', 'https', 'click', 'here', 'learn', 'please', 'contact',
    'must', 'work', 'working', 'experience', 'years', 'year', 'required',
    'requirements', 'skills', 'ability', 'strong', 'excellent', 'good',
    'team', 'company', 'business', 'including', 'within', 'across', 'using'
}

# Fixed random_state keeps the sample (and cloud layout) reproducible.
df_sample = df_clean.sample(n=min(3000, len(df_clean)), random_state=42)

# Concatenate descriptions, keep letters only, collapse whitespace.
corpus = ' '.join(df_sample['BODY'].dropna().astype(str))
corpus = re.sub(r'[^a-zA-Z\s]', ' ', corpus.lower())
corpus = re.sub(r'\s+', ' ', corpus)

wordcloud = WordCloud(
    width=1200,
    height=600,
    background_color='white',
    stopwords=custom_stopwords,
    max_words=100,
    colormap='viridis',
    collocations=False,
    random_state=42
).generate(corpus)

plt.figure(figsize=(14, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud: Common Terms in Job Descriptions', fontsize=18, fontweight='bold', pad=15)
plt.tight_layout()
plt.savefig('figures/job_description_wordcloud.png', dpi=200, bbox_inches='tight')
plt.show()

print(f"Word cloud generated from {len(df_sample):,} sampled job postings")
Word cloud generated from 3,000 sampled job postings
def extract_skills(skills_series):
    """Count comma-separated skill names across a Series of postings.

    Parameters
    ----------
    skills_series : pd.Series
        Series whose entries are comma-separated skill strings. NaN
        entries and the 'Not Listed' / empty-string sentinels are skipped.

    Returns
    -------
    Counter
        Mapping of skill name -> number of occurrences.
    """
    all_skills = Counter()
    for skills in skills_series.dropna():
        if isinstance(skills, str) and skills not in ('Not Listed', ''):
            # Drop empty tokens produced by stray/trailing commas —
            # consistent with extract_sources() later in this file.
            all_skills.update(s.strip() for s in skills.split(',') if s.strip())
    return all_skills
# Shorten skill names for better display
def shorten_skill(skill):
    """Return a compact display label for a skill name.

    Known verbose names map to a fixed short form; anything else longer
    than 14 characters is truncated with a '..' suffix.
    """
    replacements = {
        'SQL (Programming Language)': 'SQL',
        'Microsoft Excel': 'Excel',
        'Microsoft Power BI': 'Power BI',
        'Computer Science': 'Computer Sci',
        'Problem Solving': 'Problem Solving',
        'Data Visualization': 'Data Viz',
        'Business Intelligence': 'Business Intel',
        'Project Management': 'Project Mgmt',
        'Data Management': 'Data Mgmt',
        'Business Development': 'Business Dev',
        'Customer Service': 'Customer Svc',
        'Marketing Strategy': 'Mktg Strategy',
        'Market Research': 'Market Research',
        'Statistical Analysis': 'Statistics'
    }
    if skill in replacements:
        return replacements[skill]
    if len(skill) > 14:
        return skill[:14] + '..'
    return skill
# Short display names for occupation categories (radar subplot titles).
occ_short = {
    'Business Intelligence Analyst': 'Business Intelligence',
    'Data / Data Mining Analyst': 'Data/Data Mining',
    'Business / Management Analyst': 'Business/Management',
    'Market Research Analyst': 'Market Research'
}
colors_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
occupations = df_clean['LOT_V6_OCCUPATION_NAME'].unique().tolist()

# Get top 8 skills for EACH occupation separately, expressed as the
# percentage of that occupation's postings mentioning the skill.
skill_data = {}
for occ in occupations:
    subset = df_clean[df_clean['LOT_V6_OCCUPATION_NAME'] == occ]
    top8 = extract_skills(subset['SKILLS_NAME']).most_common(8)
    n_postings = len(subset)
    skill_data[occ] = {
        'skills': [name for name, _ in top8],
        'values': [count / n_postings * 100 for _, count in top8],
    }
# 2x2 grid of polar (radar) subplots, one per occupation.
fig = make_subplots(
    rows=2, cols=2,
    specs=[[{'type': 'polar'}, {'type': 'polar'}],
           [{'type': 'polar'}, {'type': 'polar'}]],
    subplot_titles=[occ_short.get(occ, occ) for occ in occupations],
    vertical_spacing=0.18,
    horizontal_spacing=0.12
)
for idx, occ in enumerate(occupations):
    grid_row, grid_col = divmod(idx, 2)
    labels = [shorten_skill(s) for s in skill_data[occ]['skills']]
    pcts = skill_data[occ]['values']
    # Repeat the first point so the radar polygon closes on itself.
    fig.add_trace(
        go.Scatterpolar(
            r=pcts + pcts[:1],
            theta=labels + labels[:1],
            fill='toself',
            name=occ_short.get(occ, occ),
            line_color=colors_list[idx],
            fillcolor=colors_list[idx],
            opacity=0.5,
            showlegend=False
        ),
        row=grid_row + 1, col=grid_col + 1
    )
# Update polar subplots: shared radial range so the four radars are
# visually comparable; clockwise angular axis starting at 12 o'clock.
fig.update_polars(
    radialaxis=dict(visible=True, tickfont=dict(size=9), range=[0, 85]),
    angularaxis=dict(tickfont=dict(size=10), rotation=90, direction='clockwise'),
)
fig.update_layout(
    title=dict(
        text='Top 8 Skills by Occupation (Each with Own Top Skills)',
        font=dict(size=16),
        y=0.98,
    ),
    height=750,
    template='plotly_white',
    margin=dict(t=80, b=30, l=60, r=60),
)
# Restyle the subplot titles (stored as layout annotations) and nudge
# them up slightly so they clear the radar polygons.
for annotation in fig.layout.annotations:
    annotation.font = dict(size=12, color='#333')
    annotation.y = annotation.y + 0.02
fig.write_image('figures/skills_radar_by_occupation.png', scale=2)
fig.show()
# Print the top skills for each occupation
print("\nTop 8 Skills by Occupation:")
for occ in occupations:
    print(f"\n{occ_short.get(occ, occ)}:")
    pairs = zip(skill_data[occ]['skills'], skill_data[occ]['values'])
    for skill, pct in pairs:
        print(f" • {skill}: {pct:.1f}%")
# Software skills - Horizontal Bar Chart
software_counter = extract_skills(df_clean['SOFTWARE_SKILLS_NAME'])
top_software = pd.DataFrame(software_counter.most_common(12), columns=['Software', 'Count'])
# Share of all cleaned postings that mention each tool.
top_software['Percentage'] = (top_software['Count'] / len(df_clean) * 100).round(1)

fig2 = go.Figure(
    data=[go.Bar(
        y=top_software['Software'],
        x=top_software['Percentage'],
        orientation='h',
        marker_color='mediumpurple',
        text=[f"{p}%" for p in top_software['Percentage']],
        textposition='outside',
    )]
)
fig2.update_layout(
    title='Top 12 Software/Technical Skills (% of Postings)',
    xaxis_title='% of Job Postings',
    yaxis={'categoryorder': 'total ascending'},
    template='plotly_white',
    height=450,
    margin=dict(r=80),  # room for the outside % labels
)
fig2.write_image('figures/software_skills.png', scale=2)
fig2.show()
Top 8 Skills by Occupation:
Business Intelligence:
• Communication: 43.7%
• SAP Applications: 35.8%
• Management: 35.7%
• Business Process: 29.5%
• Business Requirements: 26.7%
• Problem Solving: 25.4%
• Finance: 23.4%
• Consulting: 23.1%
Data/Data Mining:
• Data Analysis: 76.5%
• SQL (Programming Language): 51.6%
• Communication: 45.4%
• Management: 33.8%
• Python (Programming Language): 31.2%
• Tableau (Business Intelligence Software): 30.7%
• Microsoft Excel: 28.3%
• Dashboard: 27.9%
Business/Management:
• Communication: 51.2%
• Management: 39.2%
• Leadership: 34.2%
• Operations: 33.8%
• Microsoft Excel: 32.0%
• Problem Solving: 30.9%
• Project Management: 29.1%
• Presentations: 27.5%
Market Research:
• Customer Relationship Management: 83.1%
• Business Process: 45.8%
• Communication: 41.5%
• Business Requirements: 41.5%
• Salesforce: 40.7%
• Project Management: 40.7%
• Sales: 39.8%
• Problem Solving: 38.1%
# Salary Analysis - Violin Plot by Remote Work Type and Occupation
# Keep only rows where the remote type is actually known.
df_remote = df_clean[df_clean['REMOTE_TYPE_NAME'] != 'Not Specified'].copy()

axis_labels = {
    'REMOTE_TYPE_NAME': 'Remote Work Type',
    'SALARY': 'Annual Salary ($)',
    'LOT_V6_OCCUPATION_NAME': 'Occupation',
}
fig = px.violin(
    df_remote,
    x='REMOTE_TYPE_NAME',
    y='SALARY',
    color='LOT_V6_OCCUPATION_NAME',
    box=True,  # overlay an inner box plot on each violin
    title='Salary Distribution by Remote Work Type and Occupation',
    labels=axis_labels,
)
fig.update_layout(
    template='plotly_white',
    height=500,
    # Horizontal legend below the plot to keep the x-axis uncluttered.
    legend=dict(orientation='h', yanchor='bottom', y=-0.3),
)
fig.write_image('figures/salary_by_remote.png', scale=2)
fig.show()
# Tabular salary summary per remote type, formatted as dollar strings.
print("\nSalary Statistics by Remote Type:")
salary_stats = (
    df_remote.groupby('REMOTE_TYPE_NAME')['SALARY']
    .agg(['count', 'median', 'mean', 'std'])
    .round(0)
)
salary_stats.columns = ['Count', 'Median', 'Mean', 'Std Dev']
for money_col in ('Median', 'Mean', 'Std Dev'):
    salary_stats[money_col] = salary_stats[money_col].apply(lambda v: f"${v:,.0f}")
print(salary_stats)
Salary Statistics by Remote Type:
Count Median Mean Std Dev
REMOTE_TYPE_NAME
Hybrid Remote 1482 $95,300 $104,846 $28,957
Not Remote 688 $97,250 $104,763 $27,970
Remote 7875 $98,800 $109,097 $29,321
def extract_sources(source_series):
    """Count comma-separated source types across a Series of postings.

    NaN entries are skipped, and empty tokens (from stray commas or
    blank strings) are dropped. Returns a Counter of source -> count.
    """
    tally = Counter()
    for entry in source_series.dropna():
        if isinstance(entry, str):
            tally.update(tok.strip() for tok in entry.split(',') if tok.strip())
    return tally
source_counts = extract_sources(df_clean['SOURCE_TYPES'])

# Normalize raw source labels into reader-friendly categories and drop
# the NONE/empty placeholders.
label_map = {'Job intermediary': 'Recruiter', 'FreeJobBoard': 'Job Board'}
cleaned_counts = {}
for source, count in source_counts.items():
    if source in ('NONE', ''):
        continue
    label = label_map.get(source, source)
    cleaned_counts[label] = cleaned_counts.get(label, 0) + count

top_sources = pd.DataFrame(list(cleaned_counts.items()), columns=['Source', 'Count'])
top_sources = top_sources.sort_values('Count', ascending=False).head(10).reset_index(drop=True)
# NOTE: a posting can carry multiple sources, so percentages can sum >100.
top_sources['Percentage'] = (top_sources['Count'] / len(df_clean) * 100).round(1)

fig = px.treemap(
    top_sources,
    path=['Source'],
    values='Count',
    title='Job Posting Sources Distribution',
    color='Count',
    color_continuous_scale='Blues'
)
fig.update_layout(template='plotly_white', height=500)
fig.update_traces(textinfo='label+value+percent root', textfont_size=14)
fig.write_image('figures/source_types.png', scale=2)
fig.show()

print("\nJob Posting Sources:")
for rank, rec in enumerate(top_sources.itertuples(index=False), start=1):
    print(f" {rank}. {rec.Source}: {rec.Count:,} ({rec.Percentage}%)")
Job Posting Sources:
1. Job Board: 38,924 (83.3%)
2. Company: 13,318 (28.5%)
3. Recruiter: 2,791 (6.0%)
4. Government: 862 (1.8%)
5. Education: 521 (1.1%)
Our exploratory analysis of 46,700+ job postings reveals key insights for data analytics professionals:
Industry & Companies: Professional Services; Administrative Support & Waste Management; and Finance dominate hiring, with Deloitte, Accenture, PricewaterhouseCoopers, and Insight Global leading recruitment — consulting firms are the primary employers.
Skills Demand: Each occupation has distinct skill requirements. Data Analysts prioritize SQL and Data Analysis, while Business Analysts focus on Communication and Project Management. Market Research emphasizes CRM, Salesforce, and sales-related skills.
Compensation: Median salaries range $95K-$99K across remote types, with remote positions slightly higher. Occupation type impacts salary more than remote status.
Job Sources: 83% of postings appear on Job Boards, making platforms like LinkedIn and Indeed essential for job seekers. Direct company applications (28%) remain valuable.